!jupyter nbconvert gapminder_clean-v4.ipynb --to html
[NbConvertApp] Converting notebook gapminder_clean-v4.ipynb to html [NbConvertApp] Writing 4467851 bytes to gapminder_clean-v4.html
import pandas as pd
import plotly.express as px
from scipy.stats import pearsonr, mannwhitneyu, normaltest
from scipy import stats
import seaborn as sns
from matplotlib import pyplot as plt
import numpy as np
from statannotations.Annotator import Annotator
from functools import reduce
# Load the cleaned Gapminder table; the first CSV column is used as the row index.
df = pd.read_csv(filepath_or_buffer='gapminder_clean.csv', index_col=0)
# Preview the first five rows (shown when this is the last expression of a notebook cell).
df.head(5)
| Country Name | Year | Agriculture, value added (% of GDP) | CO2 emissions (metric tons per capita) | Domestic credit provided by financial sector (% of GDP) | Electric power consumption (kWh per capita) | Energy use (kg of oil equivalent per capita) | Exports of goods and services (% of GDP) | Fertility rate, total (births per woman) | GDP growth (annual %) | Imports of goods and services (% of GDP) | Industry, value added (% of GDP) | Inflation, GDP deflator (annual %) | Life expectancy at birth, total (years) | Population density (people per sq. km of land area) | Services, etc., value added (% of GDP) | pop | continent | gdpPercap | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | Afghanistan | 1962 | NaN | 0.073781 | 21.276422 | NaN | NaN | 4.878051 | 7.450 | NaN | 9.349593 | NaN | NaN | 33.219902 | 14.312061 | NaN | 10267083.0 | Asia | 853.100710 |
| 1 | Afghanistan | 1967 | NaN | 0.123782 | 9.917662 | NaN | NaN | 6.772908 | 7.450 | NaN | 14.209827 | NaN | NaN | 35.389415 | 15.881812 | NaN | 11537966.0 | Asia | 836.197138 |
| 2 | Afghanistan | 1972 | NaN | 0.130820 | 18.880833 | NaN | NaN | 14.763231 | 7.450 | NaN | 18.105850 | NaN | NaN | 37.610146 | 17.947027 | NaN | 13079460.0 | Asia | 739.981106 |
| 3 | Afghanistan | 1977 | NaN | 0.183118 | 13.836822 | NaN | NaN | 11.662904 | 7.449 | NaN | 14.823175 | NaN | NaN | 40.110146 | 19.998926 | NaN | 14880372.0 | Asia | 786.113360 |
| 4 | Afghanistan | 1982 | NaN | 0.165879 | NaN | NaN | NaN | NaN | 7.450 | NaN | NaN | NaN | NaN | 43.230732 | 19.402324 | NaN | 12881816.0 | Asia | 978.011439 |
# Scatter plot of CO2 emissions against GDP per capita for 1962, on log-log axes.
co2_col = 'CO2 emissions (metric tons per capita)'
df_1962 = df.loc[df['Year'] == 1962]  # keep only the rows where Year is 1962
plt.figure(figsize=(12, 8))
splot = sns.scatterplot(x=co2_col, y="gdpPercap", data=df_1962)
# both axes are logarithmic
splot.set(xscale="log", yscale="log")
plt.title('Comparing CO2 and gdpPercap in 1962');
# Pearson correlation between CO2 emissions and GDP per capita in 1962.
# Reads `df_1962`, which an earlier cell filtered to Year == 1962.
co2_col = 'CO2 emissions (metric tons per capita)'
# remove missing data in either variable
complete_1962 = df_1962.dropna(subset=[co2_col, 'gdpPercap'])
# try to fit the assumptions of Pearson correlation: treat emissions of 40 or more as outliers
new = complete_1962[complete_1962[co2_col] < 40]
r_value, p_value = pearsonr(new[co2_col], new["gdpPercap"])
print('Pearson R value:', format(r_value, '.3f'))
print('Associated p value', np.format_float_scientific(p_value, precision=3, exp_digits=3))
Pearson R value: 0.806 Associated p value 1.082e-025
# Pearson correlation between CO2 emissions and GDP per capita, computed
# separately for every year, collected into one table sorted by p value.
co2_col = 'CO2 emissions (metric tons per capita)'
# get a list of all years
years_list = list(df['Year'].unique())
# remove missing data
df_not_na = df.dropna(subset=[co2_col, 'gdpPercap'])
# try to fit the assumptions of Pearson correlation: remove outliers
# (rows whose emissions are not above 0.005)
df_1 = df_not_na[df_not_na[co2_col] > 0.005]
# map each year to its pearsonr result (R value, p value)
my_dict = {}
for year in years_list:
    in_year = df_1['Year'] == year
    my_dict[year] = pearsonr(df_1.loc[in_year, co2_col], df_1.loc[in_year, "gdpPercap"])
# one row per year, one column per statistic
df_3 = pd.DataFrame(my_dict).T
df_3.columns = ['Pearson R value', 'Associated p value']
# smallest p value first
df_3.sort_values(by='Associated p value')
| Pearson R value | Associated p value | |
|---|---|---|
| 1967 | 0.938792 | 3.397143e-53 |
| 1962 | 0.926082 | 1.128679e-46 |
| 1972 | 0.842899 | 1.824292e-32 |
| 1997 | 0.808140 | 7.976156e-30 |
| 1992 | 0.808434 | 3.640308e-29 |
| 2002 | 0.800642 | 3.863564e-29 |
| 1982 | 0.816638 | 5.565916e-29 |
| 1987 | 0.809553 | 3.899627e-28 |
| 1977 | 0.792834 | 2.838892e-26 |
| 2007 | 0.720417 | 9.232747e-22 |
# Interactive log-log scatter of CO2 emissions against GDP per capita for 1967.
co2_col = 'CO2 emissions (metric tons per capita)'
# remove missing data
df_not_na = df.dropna(subset=[co2_col, 'gdpPercap'])
# filter the data to include only rows where Year is 1967
df_1967 = df_not_na.loc[df_not_na['Year'] == 1967]
scatter_options = dict(
    x=co2_col,
    y="gdpPercap",
    color="continent",            # one colour per continent
    size='pop',                   # marker size taken from the 'pop' column
    log_x=True,
    log_y=True,
    hover_data=["Country Name"],  # show the country name on hover
)
fig = px.scatter(df_1967, **scatter_options)
fig.show()
Europe has significantly more energy use than the Americas after 1970.
# Energy use per capita by continent in 1962: log-scale box plot annotated
# with pairwise Mann-Whitney tests.
energy_col = 'Energy use (kg of oil equivalent per capita)'
# keep only the columns needed and drop rows with any missing value
df_energy = df[['Year', energy_col, 'continent']].dropna()
# year 1962
df_1962_energy = df_energy.loc[df_energy['Year'] == 1962]
# continent pairs to compare (Africa is not part of this list)
pairs = [
    ('Europe', 'Oceania'),
    ('Europe', 'Americas'),
    ('Europe', 'Asia'),
    ('Americas', 'Oceania'),
    ('Asia', 'Oceania'),
    ('Asia', 'Americas'),
]
box_args = dict(x='continent', y=energy_col, data=df_1962_energy)
with sns.plotting_context('notebook', font_scale=1):
    fig, ax = plt.subplots(1, 1, figsize=(10, 6))
    ax.set_yscale("log")
    sns.boxplot(**box_args)
    annotator = Annotator(ax, pairs, **box_args)
    # new_plot is called with the same axes, pairs and data as the constructor
    annotator.new_plot(ax, pairs=pairs, **box_args)
    annotator.configure(test='Mann-Whitney', verbose=True).apply_and_annotate()
    plt.title('Year 1962')
    plt.show()
p-value annotation legend:
ns: p <= 1.00e+00
*: 1.00e-02 < p <= 5.00e-02
**: 1.00e-03 < p <= 1.00e-02
***: 1.00e-04 < p <= 1.00e-03
****: p <= 1.00e-04
Oceania vs. Europe: Mann-Whitney-Wilcoxon test two-sided, P_val:3.689e-01 U_stat=2.700e+01
Europe vs. Americas: Mann-Whitney-Wilcoxon test two-sided, P_val:2.666e-02 U_stat=0.000e+00
Americas vs. Asia: Mann-Whitney-Wilcoxon test two-sided, P_val:5.403e-01 U_stat=2.000e+00
Europe vs. Asia: Mann-Whitney-Wilcoxon test two-sided, P_val:3.859e-01 U_stat=1.500e+01
Oceania vs. Americas: Mann-Whitney-Wilcoxon test two-sided, P_val:2.453e-01 U_stat=0.000e+00
Oceania vs. Asia: Mann-Whitney-Wilcoxon test two-sided, P_val:5.403e-01 U_stat=2.000e+00
# Energy use per capita by continent in 1967: log-scale box plot annotated
# with pairwise Mann-Whitney tests. Reads `df_energy` and `pairs`, which are
# defined by an earlier cell.
energy_col = 'Energy use (kg of oil equivalent per capita)'
# year 1967
df_1967_energy = df_energy.loc[df_energy['Year'] == 1967]
box_args = dict(x='continent', y=energy_col, data=df_1967_energy)
with sns.plotting_context('notebook', font_scale=1):
    fig, ax = plt.subplots(1, 1, figsize=(10, 6))
    ax.set_yscale("log")
    sns.boxplot(**box_args)
    annotator = Annotator(ax, pairs, **box_args)
    # new_plot is called with the same axes, pairs and data as the constructor
    annotator.new_plot(ax, pairs=pairs, **box_args)
    annotator.configure(test='Mann-Whitney', verbose=True).apply_and_annotate()
    plt.title('Year 1967')
    plt.show()
p-value annotation legend:
ns: p <= 1.00e+00
*: 1.00e-02 < p <= 5.00e-02
**: 1.00e-03 < p <= 1.00e-02
***: 1.00e-04 < p <= 1.00e-03
****: p <= 1.00e-04
Oceania vs. Europe: Mann-Whitney-Wilcoxon test two-sided, P_val:3.317e-01 U_stat=2.900e+01
Europe vs. Americas: Mann-Whitney-Wilcoxon test two-sided, P_val:2.594e-02 U_stat=0.000e+00
Americas vs. Asia: Mann-Whitney-Wilcoxon test two-sided, P_val:5.403e-01 U_stat=2.000e+00
Europe vs. Asia: Mann-Whitney-Wilcoxon test two-sided, P_val:5.633e-01 U_stat=1.400e+01
Oceania vs. Americas: Mann-Whitney-Wilcoxon test two-sided, P_val:2.453e-01 U_stat=0.000e+00
Oceania vs. Asia: Mann-Whitney-Wilcoxon test two-sided, P_val:5.403e-01 U_stat=2.000e+00
# Energy use per capita by continent in 1972: log-scale box plot annotated
# with pairwise Mann-Whitney tests. Reads `df_energy` from an earlier cell.
energy_col = 'Energy use (kg of oil equivalent per capita)'
# year 1972
df_1972_energy = df_energy.loc[df_energy['Year'] == 1972]
# continent pairs to compare; from here on the list also includes Africa
pairs = [
    ('Europe', 'Oceania'),
    ('Europe', 'Americas'),
    ('Europe', 'Asia'),
    ('Americas', 'Oceania'),
    ('Asia', 'Oceania'),
    ('Asia', 'Americas'),
    ('Europe', 'Africa'),
    ('Africa', 'Americas'),
    ('Africa', 'Oceania'),
    ('Africa', 'Asia'),
]
box_args = dict(x='continent', y=energy_col, data=df_1972_energy)
with sns.plotting_context('notebook', font_scale=1):
    fig, ax = plt.subplots(1, 1, figsize=(10, 6))
    ax.set_yscale("log")
    sns.boxplot(**box_args)
    annotator = Annotator(ax, pairs, **box_args)
    # new_plot is called with the same axes, pairs and data as the constructor
    annotator.new_plot(ax, pairs=pairs, **box_args)
    annotator.configure(test='Mann-Whitney', verbose=True).apply_and_annotate()
    plt.title('Year 1972')
    plt.show()
p-value annotation legend:
ns: p <= 1.00e+00
*: 1.00e-02 < p <= 5.00e-02
**: 1.00e-03 < p <= 1.00e-02
***: 1.00e-04 < p <= 1.00e-03
****: p <= 1.00e-04
Americas vs. Oceania: Mann-Whitney-Wilcoxon test two-sided, P_val:7.962e-02 U_stat=5.000e+00
Oceania vs. Asia: Mann-Whitney-Wilcoxon test two-sided, P_val:8.482e-02 U_stat=3.900e+01
Europe vs. Africa: Mann-Whitney-Wilcoxon test two-sided, P_val:6.494e-08 U_stat=5.290e+02
Africa vs. Americas: Mann-Whitney-Wilcoxon test two-sided, P_val:3.570e-02 U_stat=1.600e+02
Europe vs. Americas: Mann-Whitney-Wilcoxon test two-sided, P_val:5.233e-05 U_stat=4.840e+02
Americas vs. Asia: Mann-Whitney-Wilcoxon test two-sided, P_val:3.775e-02 U_stat=3.450e+02
Africa vs. Oceania: Mann-Whitney-Wilcoxon test two-sided, P_val:2.473e-02 U_stat=0.000e+00
Europe vs. Oceania: Mann-Whitney-Wilcoxon test two-sided, P_val:7.459e-01 U_stat=2.100e+01
Africa vs. Asia: Mann-Whitney-Wilcoxon test two-sided, P_val:5.186e-01 U_stat=2.700e+02
Europe vs. Asia: Mann-Whitney-Wilcoxon test two-sided, P_val:2.316e-05 U_stat=4.740e+02
# Energy use per capita by continent in 1977: log-scale box plot annotated
# with pairwise Mann-Whitney tests. Reads `df_energy` and `pairs`, which are
# defined by earlier cells.
energy_col = 'Energy use (kg of oil equivalent per capita)'
# year 1977
df_1977_energy = df_energy.loc[df_energy['Year'] == 1977]
box_args = dict(x='continent', y=energy_col, data=df_1977_energy)
with sns.plotting_context('notebook', font_scale=1):
    fig, ax = plt.subplots(1, 1, figsize=(10, 6))
    ax.set_yscale("log")
    sns.boxplot(**box_args)
    annotator = Annotator(ax, pairs, **box_args)
    # new_plot is called with the same axes, pairs and data as the constructor
    annotator.new_plot(ax, pairs=pairs, **box_args)
    annotator.configure(test='Mann-Whitney', verbose=True).apply_and_annotate()
    plt.title('Year 1977')
    plt.show()
p-value annotation legend:
ns: p <= 1.00e+00
*: 1.00e-02 < p <= 5.00e-02
**: 1.00e-03 < p <= 1.00e-02
***: 1.00e-04 < p <= 1.00e-03
****: p <= 1.00e-04
Americas vs. Oceania: Mann-Whitney-Wilcoxon test two-sided, P_val:6.387e-02 U_stat=4.000e+00
Oceania vs. Asia: Mann-Whitney-Wilcoxon test two-sided, P_val:6.758e-02 U_stat=4.000e+01
Europe vs. Africa: Mann-Whitney-Wilcoxon test two-sided, P_val:4.536e-08 U_stat=5.320e+02
Africa vs. Americas: Mann-Whitney-Wilcoxon test two-sided, P_val:7.132e-03 U_stat=1.340e+02
Europe vs. Americas: Mann-Whitney-Wilcoxon test two-sided, P_val:1.766e-05 U_stat=4.960e+02
Americas vs. Asia: Mann-Whitney-Wilcoxon test two-sided, P_val:9.073e-02 U_stat=3.280e+02
Africa vs. Oceania: Mann-Whitney-Wilcoxon test two-sided, P_val:2.473e-02 U_stat=0.000e+00
Europe vs. Oceania: Mann-Whitney-Wilcoxon test two-sided, P_val:4.313e-01 U_stat=1.600e+01
Africa vs. Asia: Mann-Whitney-Wilcoxon test two-sided, P_val:7.513e-01 U_stat=2.280e+02
Europe vs. Asia: Mann-Whitney-Wilcoxon test two-sided, P_val:1.914e-05 U_stat=4.760e+02
# Energy use per capita by continent in 1982: log-scale box plot annotated
# with pairwise Mann-Whitney tests. Reads `df_energy` and `pairs`, which are
# defined by earlier cells.
energy_col = 'Energy use (kg of oil equivalent per capita)'
# year 1982
df_1982_energy = df_energy.loc[df_energy['Year'] == 1982]
box_args = dict(x='continent', y=energy_col, data=df_1982_energy)
with sns.plotting_context('notebook', font_scale=1):
    fig, ax = plt.subplots(1, 1, figsize=(10, 6))
    ax.set_yscale("log")
    sns.boxplot(**box_args)
    annotator = Annotator(ax, pairs, **box_args)
    # new_plot is called with the same axes, pairs and data as the constructor
    annotator.new_plot(ax, pairs=pairs, **box_args)
    annotator.configure(test='Mann-Whitney', verbose=True).apply_and_annotate()
    plt.title('Year 1982')
    plt.show()
p-value annotation legend:
ns: p <= 1.00e+00
*: 1.00e-02 < p <= 5.00e-02
**: 1.00e-03 < p <= 1.00e-02
***: 1.00e-04 < p <= 1.00e-03
****: p <= 1.00e-04
Americas vs. Oceania: Mann-Whitney-Wilcoxon test two-sided, P_val:7.962e-02 U_stat=5.000e+00
Oceania vs. Asia: Mann-Whitney-Wilcoxon test two-sided, P_val:8.482e-02 U_stat=3.900e+01
Europe vs. Africa: Mann-Whitney-Wilcoxon test two-sided, P_val:5.709e-08 U_stat=5.510e+02
Africa vs. Americas: Mann-Whitney-Wilcoxon test two-sided, P_val:1.305e-02 U_stat=1.510e+02
Europe vs. Americas: Mann-Whitney-Wilcoxon test two-sided, P_val:2.125e-05 U_stat=4.940e+02
Americas vs. Asia: Mann-Whitney-Wilcoxon test two-sided, P_val:4.072e-01 U_stat=2.900e+02
Africa vs. Oceania: Mann-Whitney-Wilcoxon test two-sided, P_val:2.421e-02 U_stat=0.000e+00
Europe vs. Oceania: Mann-Whitney-Wilcoxon test two-sided, P_val:4.313e-01 U_stat=1.600e+01
Africa vs. Asia: Mann-Whitney-Wilcoxon test two-sided, P_val:5.474e-01 U_stat=2.260e+02
Europe vs. Asia: Mann-Whitney-Wilcoxon test two-sided, P_val:1.090e-04 U_stat=4.570e+02
# Energy use per capita by continent in 1987: log-scale box plot annotated
# with pairwise Mann-Whitney tests. Reads `df_energy` and `pairs`, which are
# defined by earlier cells.
energy_col = 'Energy use (kg of oil equivalent per capita)'
# year 1987
df_1987_energy = df_energy.loc[df_energy['Year'] == 1987]
box_args = dict(x='continent', y=energy_col, data=df_1987_energy)
with sns.plotting_context('notebook', font_scale=1):
    fig, ax = plt.subplots(1, 1, figsize=(10, 6))
    ax.set_yscale("log")
    sns.boxplot(**box_args)
    annotator = Annotator(ax, pairs, **box_args)
    # new_plot is called with the same axes, pairs and data as the constructor
    annotator.new_plot(ax, pairs=pairs, **box_args)
    annotator.configure(test='Mann-Whitney', verbose=True).apply_and_annotate()
    plt.title('Year 1987')
    plt.show()
p-value annotation legend:
ns: p <= 1.00e+00
*: 1.00e-02 < p <= 5.00e-02
**: 1.00e-03 < p <= 1.00e-02
***: 1.00e-04 < p <= 1.00e-03
****: p <= 1.00e-04
Americas vs. Oceania: Mann-Whitney-Wilcoxon test two-sided, P_val:7.962e-02 U_stat=5.000e+00
Oceania vs. Asia: Mann-Whitney-Wilcoxon test two-sided, P_val:7.962e-02 U_stat=4.100e+01
Europe vs. Africa: Mann-Whitney-Wilcoxon test two-sided, P_val:2.516e-08 U_stat=5.580e+02
Africa vs. Americas: Mann-Whitney-Wilcoxon test two-sided, P_val:3.891e-02 U_stat=1.700e+02
Europe vs. Americas: Mann-Whitney-Wilcoxon test two-sided, P_val:1.334e-05 U_stat=4.990e+02
Americas vs. Asia: Mann-Whitney-Wilcoxon test two-sided, P_val:9.125e-01 U_stat=2.700e+02
Africa vs. Oceania: Mann-Whitney-Wilcoxon test two-sided, P_val:2.421e-02 U_stat=0.000e+00
Europe vs. Oceania: Mann-Whitney-Wilcoxon test two-sided, P_val:5.473e-01 U_stat=1.800e+01
Africa vs. Asia: Mann-Whitney-Wilcoxon test two-sided, P_val:2.355e-01 U_stat=2.100e+02
Europe vs. Asia: Mann-Whitney-Wilcoxon test two-sided, P_val:1.137e-04 U_stat=4.750e+02
# Energy use per capita by continent in 1992: log-scale box plot annotated
# with pairwise Mann-Whitney tests. Reads `df_energy` and `pairs`, which are
# defined by earlier cells.
energy_col = 'Energy use (kg of oil equivalent per capita)'
# year 1992
df_1992_energy = df_energy.loc[df_energy['Year'] == 1992]
box_args = dict(x='continent', y=energy_col, data=df_1992_energy)
with sns.plotting_context('notebook', font_scale=1):
    fig, ax = plt.subplots(1, 1, figsize=(10, 6))
    ax.set_yscale("log")
    sns.boxplot(**box_args)
    annotator = Annotator(ax, pairs, **box_args)
    # new_plot is called with the same axes, pairs and data as the constructor
    annotator.new_plot(ax, pairs=pairs, **box_args)
    annotator.configure(test='Mann-Whitney', verbose=True).apply_and_annotate()
    plt.title('Year 1992')
    plt.show()
p-value annotation legend:
ns: p <= 1.00e+00
*: 1.00e-02 < p <= 5.00e-02
**: 1.00e-03 < p <= 1.00e-02
***: 1.00e-04 < p <= 1.00e-03
****: p <= 1.00e-04
Americas vs. Oceania: Mann-Whitney-Wilcoxon test two-sided, P_val:9.838e-02 U_stat=6.000e+00
Oceania vs. Asia: Mann-Whitney-Wilcoxon test two-sided, P_val:6.758e-02 U_stat=4.000e+01
Europe vs. Africa: Mann-Whitney-Wilcoxon test two-sided, P_val:1.555e-08 U_stat=6.890e+02
Africa vs. Americas: Mann-Whitney-Wilcoxon test two-sided, P_val:1.575e-02 U_stat=1.700e+02
Europe vs. Americas: Mann-Whitney-Wilcoxon test two-sided, P_val:3.675e-05 U_stat=5.580e+02
Americas vs. Asia: Mann-Whitney-Wilcoxon test two-sided, P_val:9.909e-01 U_stat=2.520e+02
Africa vs. Oceania: Mann-Whitney-Wilcoxon test two-sided, P_val:2.331e-02 U_stat=0.000e+00
Europe vs. Oceania: Mann-Whitney-Wilcoxon test two-sided, P_val:2.436e-01 U_stat=1.400e+01
Africa vs. Asia: Mann-Whitney-Wilcoxon test two-sided, P_val:1.222e-01 U_stat=2.020e+02
Europe vs. Asia: Mann-Whitney-Wilcoxon test two-sided, P_val:5.973e-04 U_stat=5.000e+02
# Energy use per capita by continent in 1997: log-scale box plot annotated
# with pairwise Mann-Whitney tests. Reads `df_energy` and `pairs`, which are
# defined by earlier cells.
energy_col = 'Energy use (kg of oil equivalent per capita)'
# year 1997
df_1997_energy = df_energy.loc[df_energy['Year'] == 1997]
box_args = dict(x='continent', y=energy_col, data=df_1997_energy)
with sns.plotting_context('notebook', font_scale=1):
    fig, ax = plt.subplots(1, 1, figsize=(10, 6))
    ax.set_yscale("log")
    sns.boxplot(**box_args)
    annotator = Annotator(ax, pairs, **box_args)
    # new_plot is called with the same axes, pairs and data as the constructor
    annotator.new_plot(ax, pairs=pairs, **box_args)
    annotator.configure(test='Mann-Whitney', verbose=True).apply_and_annotate()
    plt.title('Year 1997')
    plt.show()
p-value annotation legend:
ns: p <= 1.00e+00
*: 1.00e-02 < p <= 5.00e-02
**: 1.00e-03 < p <= 1.00e-02
***: 1.00e-04 < p <= 1.00e-03
****: p <= 1.00e-04
Americas vs. Oceania: Mann-Whitney-Wilcoxon test two-sided, P_val:7.962e-02 U_stat=5.000e+00
Oceania vs. Asia: Mann-Whitney-Wilcoxon test two-sided, P_val:1.124e-01 U_stat=4.100e+01
Europe vs. Africa: Mann-Whitney-Wilcoxon test two-sided, P_val:4.209e-08 U_stat=6.790e+02
Africa vs. Americas: Mann-Whitney-Wilcoxon test two-sided, P_val:3.615e-03 U_stat=1.460e+02
Europe vs. Americas: Mann-Whitney-Wilcoxon test two-sided, P_val:5.462e-05 U_stat=5.530e+02
Americas vs. Asia: Mann-Whitney-Wilcoxon test two-sided, P_val:9.237e-01 U_stat=2.710e+02
Africa vs. Oceania: Mann-Whitney-Wilcoxon test two-sided, P_val:2.331e-02 U_stat=0.000e+00
Europe vs. Oceania: Mann-Whitney-Wilcoxon test two-sided, P_val:1.594e-01 U_stat=1.100e+01
Africa vs. Asia: Mann-Whitney-Wilcoxon test two-sided, P_val:5.876e-02 U_stat=2.050e+02
Europe vs. Asia: Mann-Whitney-Wilcoxon test two-sided, P_val:2.456e-03 U_stat=5.180e+02
# Energy use per capita by continent in 2002: log-scale box plot annotated
# with pairwise Mann-Whitney tests. Reads `df_energy` and `pairs`, which are
# defined by earlier cells.
energy_col = 'Energy use (kg of oil equivalent per capita)'
# year 2002
df_2002_energy = df_energy.loc[df_energy['Year'] == 2002]
box_args = dict(x='continent', y=energy_col, data=df_2002_energy)
with sns.plotting_context('notebook', font_scale=1):
    fig, ax = plt.subplots(1, 1, figsize=(10, 6))
    ax.set_yscale("log")
    sns.boxplot(**box_args)
    annotator = Annotator(ax, pairs, **box_args)
    # new_plot is called with the same axes, pairs and data as the constructor
    annotator.new_plot(ax, pairs=pairs, **box_args)
    annotator.configure(test='Mann-Whitney', verbose=True).apply_and_annotate()
    plt.title('Year 2002')
    plt.show()
p-value annotation legend:
ns: p <= 1.00e+00
*: 1.00e-02 < p <= 5.00e-02
**: 1.00e-03 < p <= 1.00e-02
***: 1.00e-04 < p <= 1.00e-03
****: p <= 1.00e-04
Americas vs. Oceania: Mann-Whitney-Wilcoxon test two-sided, P_val:9.838e-02 U_stat=6.000e+00
Oceania vs. Asia: Mann-Whitney-Wilcoxon test two-sided, P_val:9.219e-02 U_stat=4.200e+01
Europe vs. Africa: Mann-Whitney-Wilcoxon test two-sided, P_val:4.679e-09 U_stat=7.250e+02
Africa vs. Americas: Mann-Whitney-Wilcoxon test two-sided, P_val:4.307e-03 U_stat=1.560e+02
Europe vs. Americas: Mann-Whitney-Wilcoxon test two-sided, P_val:2.661e-05 U_stat=5.620e+02
Americas vs. Asia: Mann-Whitney-Wilcoxon test two-sided, P_val:9.406e-01 U_stat=2.720e+02
Africa vs. Oceania: Mann-Whitney-Wilcoxon test two-sided, P_val:2.292e-02 U_stat=0.000e+00
Europe vs. Oceania: Mann-Whitney-Wilcoxon test two-sided, P_val:1.169e-01 U_stat=9.000e+00
Africa vs. Asia: Mann-Whitney-Wilcoxon test two-sided, P_val:1.648e-02 U_stat=1.880e+02
Europe vs. Asia: Mann-Whitney-Wilcoxon test two-sided, P_val:1.820e-03 U_stat=5.230e+02
# Energy use per capita by continent in 2007: log-scale box plot annotated
# with pairwise Mann-Whitney tests. Reads `df_energy` and `pairs`, which are
# defined by earlier cells.
energy_col = 'Energy use (kg of oil equivalent per capita)'
# year 2007
df_2007_energy = df_energy.loc[df_energy['Year'] == 2007]
box_args = dict(x='continent', y=energy_col, data=df_2007_energy)
with sns.plotting_context('notebook', font_scale=1):
    fig, ax = plt.subplots(1, 1, figsize=(10, 6))
    ax.set_yscale("log")
    sns.boxplot(**box_args)
    annotator = Annotator(ax, pairs, **box_args)
    # new_plot is called with the same axes, pairs and data as the constructor
    annotator.new_plot(ax, pairs=pairs, **box_args)
    annotator.configure(test='Mann-Whitney', verbose=True).apply_and_annotate()
    plt.title('Year 2007')
    plt.show()
p-value annotation legend:
ns: p <= 1.00e+00
*: 1.00e-02 < p <= 5.00e-02
**: 1.00e-03 < p <= 1.00e-02
***: 1.00e-04 < p <= 1.00e-03
****: p <= 1.00e-04
Americas vs. Oceania: Mann-Whitney-Wilcoxon test two-sided, P_val:9.838e-02 U_stat=6.000e+00
Oceania vs. Asia: Mann-Whitney-Wilcoxon test two-sided, P_val:1.358e-01 U_stat=4.000e+01
Europe vs. Africa: Mann-Whitney-Wilcoxon test two-sided, P_val:3.643e-10 U_stat=9.510e+02
Africa vs. Americas: Mann-Whitney-Wilcoxon test two-sided, P_val:5.319e-04 U_stat=1.710e+02
Europe vs. Americas: Mann-Whitney-Wilcoxon test two-sided, P_val:2.193e-05 U_stat=5.820e+02
Americas vs. Asia: Mann-Whitney-Wilcoxon test two-sided, P_val:9.915e-01 U_stat=2.760e+02
Africa vs. Oceania: Mann-Whitney-Wilcoxon test two-sided, P_val:2.091e-02 U_stat=0.000e+00
Europe vs. Oceania: Mann-Whitney-Wilcoxon test two-sided, P_val:1.290e-01 U_stat=1.000e+01
Africa vs. Asia: Mann-Whitney-Wilcoxon test two-sided, P_val:2.716e-03 U_stat=2.100e+02
Europe vs. Asia: Mann-Whitney-Wilcoxon test two-sided, P_val:3.355e-03 U_stat=5.290e+02
No significant difference in imports (% of GDP) between Europe and Asia in any year after 1990.
# Imports of goods and services (% of GDP), Europe vs Asia, for every year
# after 1990: grouped log-scale box plot with a Mann-Whitney test per year.
imports_col = 'Imports of goods and services (% of GDP)'
# subset dataframe: rows after 1990 for the two continents, needed columns only,
# rows with any missing value dropped
df_after1990 = df[(df['Year'] > 1990) & (df['continent'].isin(['Europe', 'Asia']))]
df_import = df_after1990[['Year', imports_col, 'continent']].dropna()
# plotting
# each pair compares the two continents within one year: (x value, hue value)
pairs = (
    [(1992, 'Europe'), (1992, 'Asia')],
    [(1997, 'Europe'), (1997, 'Asia')],
    [(2002, 'Europe'), (2002, 'Asia')],
    [(2007, 'Europe'), (2007, 'Asia')],
)
plot_args = dict(x='Year', y=imports_col, data=df_import, hue='continent')
with sns.plotting_context('notebook', font_scale=1):
    fig, ax = plt.subplots(1, 1, figsize=(12, 8))
    ax.set_yscale("log")
    sns.boxplot(ax=ax, **plot_args)
    # Build the annotator here instead of reusing the `annotator` object left
    # behind by an earlier cell, so this cell also runs on its own.
    annotator = Annotator(ax, pairs, **plot_args)
    annotator.configure(test='Mann-Whitney', verbose=True).apply_and_annotate()
    plt.show()
p-value annotation legend:
ns: p <= 1.00e+00
*: 1.00e-02 < p <= 5.00e-02
**: 1.00e-03 < p <= 1.00e-02
***: 1.00e-04 < p <= 1.00e-03
****: p <= 1.00e-04
1997_Asia vs. 1997_Europe: Mann-Whitney-Wilcoxon test two-sided, P_val:3.301e-01 U_stat=4.030e+02
1992_Asia vs. 1992_Europe: Mann-Whitney-Wilcoxon test two-sided, P_val:4.884e-01 U_stat=3.080e+02
2002_Asia vs. 2002_Europe: Mann-Whitney-Wilcoxon test two-sided, P_val:9.019e-01 U_stat=3.820e+02
2007_Asia vs. 2007_Europe: Mann-Whitney-Wilcoxon test two-sided, P_val:3.975e-01 U_stat=3.380e+02
Macao SAR, China and Monaco share the highest average population-density rank.
# Average population-density rank per country across all years.
# Within each year countries are ranked by density (rank 1 = lowest density);
# the per-year ranks are then averaged for each country.
density_col = 'Population density (people per sq. km of land area)'
# subset dataframe and drop rows with a missing value
df_pop = df[['Country Name', density_col, 'Year']].dropna()
# get a list of all years
years_list = list(df_pop['Year'].unique())
# one row per country, one column per year; a country with no value for a
# year gets NaN in that column
# NOTE: pivot assumes each (country, year) pair occurs at most once
density_by_year = df_pop.pivot(index='Country Name', columns='Year', values=density_col)
# rank within each year column; missing values stay NaN and are not ranked
rank_by_year = density_by_year.rank()
# skipna=False keeps the average NaN for any country that lacks a rank in
# some year, so only countries present in every year survive the dropna;
# the divisor is the number of year columns rather than a hard-coded 10
average_rank = rank_by_year.mean(axis=1, skipna=False)
df_total = average_rank.to_frame('Average rank').dropna()
# sort: highest average rank (densest) first
df_total.sort_values(by=['Average rank'], ascending=False)
| Average rank | |
|---|---|
| Country Name | |
| Monaco | 255.3 |
| Macao SAR, China | 255.3 |
| Hong Kong SAR, China | 253.7 |
| Singapore | 252.9 |
| Gibraltar | 251.8 |
| ... | ... |
| Australia | 5.9 |
| Mauritania | 4.5 |
| Namibia | 3.0 |
| Mongolia | 2.0 |
| Greenland | 1.0 |
252 rows × 1 columns
Maldives had the largest increase in life expectancy between 1962 and 2007.
# Change in life expectancy at birth between 1962 and 2007, per country.
life_col = 'Life expectancy at birth, total (years)'
# subset: rows for the two years of interest, needed columns only, no missing values
df_life = df.loc[df['Year'].isin([1962, 2007]), ['Country Name', 'Year', life_col]].dropna()
# one frame per year, indexed by country so the two can be aligned
df_1962 = df_life.loc[df_life['Year'] == 1962].set_index('Country Name')
df_2007 = df_life.loc[df_life['Year'] == 2007].set_index('Country Name')
# merge dataframe: default inner join on the index keeps countries present in both years
mergedDf = pd.merge(df_1962, df_2007, left_index=True, right_index=True)
mergedDf = mergedDf.drop(columns=['Year_x', 'Year_y'])
mergedDf.columns = ['Life expectancy 1962', 'Life expectancy 2007']
mergedDf['Increase'] = mergedDf['Life expectancy 2007'] - mergedDf['Life expectancy 1962']
# sort: largest increase first
mergedDf.sort_values(by=['Increase'], ascending=False)
| Life expectancy 1962 | Life expectancy 2007 | Increase | |
|---|---|---|---|
| Country Name | |||
| Maldives | 38.483561 | 75.399707 | 36.916146 |
| Bhutan | 33.094146 | 66.293098 | 33.198951 |
| Timor-Leste | 34.739049 | 65.824195 | 31.085146 |
| Tunisia | 43.341683 | 74.202439 | 30.860756 |
| Oman | 44.300512 | 75.123610 | 30.823098 |
| ... | ... | ... | ... |
| Belarus | 68.635829 | 70.203415 | 1.567585 |
| Russian Federation | 67.021415 | 67.497561 | 0.476146 |
| Ukraine | 69.146098 | 68.222195 | -0.923902 |
| Lesotho | 47.402244 | 44.882220 | -2.520024 |
| Zimbabwe | 52.277902 | 44.177756 | -8.100146 |
236 rows × 3 columns